library(tidyverse)
library(plyr)
library(ggplot2)
library(effects)
library(lmerTest)
Unigram_data <- read_csv2("Unigrams.csv")
Unigram_data <- Unigram_data[-which(Unigram_data$Speaker_ID == "Speaker_ID"),]
nrow(Unigram_data)
[1] 279153
Unigram_data$Time_s <- as.double(Unigram_data$Time_s)
Unigram_data$Abs_value <- as.double(Unigram_data$Abs_value)
Unigram_data$Age <- as.double(Unigram_data$Age)
Unigram_data$Sex <- as.factor(Unigram_data$Sex)
#The functions define the boundaries of unigrams
cut_unigram <- function(Abs_value){
unigrams <- cut(Abs_value, breaks = 3, labels = c(-1,0,1))
return(unigrams)
}
cut_unigram_raw <- function(Abs_value){
unigrams <- cut(Abs_value, breaks = 3)
return(unigrams)
}
# Ordering the data by speaker
Unigram_data <- Unigram_data[order(Unigram_data$Speaker_ID),]
# Normalizing (Z-scaling the) absolute pitch value
Unigram_data$zAbs_value <- ave(as.numeric(Unigram_data$Abs_value), Unigram_data$Speaker_ID, FUN=scale)
# Cutting the unigrams into intervals
zUnigram_cut <- with(Unigram_data, tapply(zAbs_value, Speaker_ID,cut_unigram_raw))
zUnigram_raw <- c()
#Attaching normalized dataand intervals to the other data
for (element in zUnigram_cut){
zUnigram_raw <- c(zUnigram_raw,as.character(element))
}
Unigram_data <- cbind(Unigram_data,zUnigram_raw)
#Counting z-normalized unigrams
zUnigram_cut <- with(Unigram_data, tapply(zAbs_value, Speaker_ID,cut_unigram))
zUnigram <- c()
for (element in zUnigram_cut){
zUnigram <- c(zUnigram,as.character(element))
}
Unigram_data <- cbind(Unigram_data,zUnigram)
Unigram_data$Unigram <- as.factor(Unigram_data$Unigram)
Unigram_data$zUnigram <- as.factor(Unigram_data$zUnigram)
#levels(Unigram_data$zUnigram)
#Calculating deltas and normalized deltas
zDelta <- diff(as.numeric(zUnigram))
zDelta <- c(NA,zDelta)
Unigram_data <- cbind(Unigram_data,zDelta)
Delta <- diff(as.numeric(Unigram_data$Unigram))
Delta <- c(NA,Delta)
#attaching the deltas to the data
Unigram_data <- cbind(Unigram_data,Delta)
Unigram_data$Delta <- as.factor(Unigram_data$Delta)
Unigram_data$zDelta <- as.factor(Unigram_data$zDelta)
# Creating unique ID's for each sentence
Unigram_data <- cbind(paste(Unigram_data$Speaker_ID,Unigram_data$Sentence),Unigram_data)
colnames(Unigram_data)[1] <- "ID"
Unigram_data$ID <- as.factor(Unigram_data$ID)
#Unigram_data$Unigram <- as.factor(Unigram_data$Unigram)
# Counting raw unigrams
df <- with(Unigram_data, tapply(Unigram, ID, plyr::count))
Raw_Ug_Counted <- ldply(df, data.frame)
Raw_Ug_Counted <- spread(Raw_Ug_Counted, x, freq)
colnames(Raw_Ug_Counted)[1] <- "ID"
Raw_Ug_Counted$Speaker_ID <- Unigram_data$Speaker_ID[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Raw_Ug_Counted$Sex <- Unigram_data$Sex[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Raw_Ug_Counted$Age <- Unigram_data$Age[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Raw_Ug_Counted$Place <- Unigram_data$Place[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Raw_Ug_Counted$Text <- Unigram_data$Text[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Raw_Ug_Counted$Sentence <- Unigram_data$Sentence[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
#Counting raw deltas
bdf <- with(Unigram_data, tapply(Delta, ID, plyr::count), default = 0)
Raw_Dlt_Counted <- ldply(bdf, data.frame)
Raw_Dlt_Counted <- spread(Raw_Dlt_Counted, x, freq)
colnames(Raw_Dlt_Counted)[1] <- "ID"
Raw_Dlt_Counted$Speaker_ID <- Unigram_data$Speaker_ID[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Raw_Dlt_Counted$Sex <- Unigram_data$Sex[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Raw_Dlt_Counted$Age <- Unigram_data$Age[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Raw_Dlt_Counted$Place <- Unigram_data$Place[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Raw_Dlt_Counted$Text <- Unigram_data$Text[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Raw_Dlt_Counted$Sentence <- Unigram_data$Sentence[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
#Counting z-normalised Unigrams
zdf <- with(Unigram_data, tapply(zUnigram, ID, plyr::count))
zUg_Counted <- ldply(zdf, data.frame)
zUg_Counted <- spread(zUg_Counted, x, freq)
zUg_Counted[is.na(zUg_Counted)] <- 0
colnames(zUg_Counted)[1] <- "ID"
zUg_Counted$Speaker_ID <- Unigram_data$Speaker_ID[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
zUg_Counted$Sex <- Unigram_data$Sex[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
zUg_Counted$Age <- Unigram_data$Age[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
zUg_Counted$Place <- Unigram_data$Place[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
zUg_Counted$Text <- Unigram_data$Text[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
zUg_Counted$Sentence <- Unigram_data$Sentence[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
#Counting z-normalized Deltas
zbdf <- with(Unigram_data, tapply(zDelta, ID, plyr::count))
zDlt_Counted <- ldply(zbdf, data.frame)
zDlt_Counted <- spread(zDlt_Counted, x, freq)
zDlt_Counted[is.na(zDlt_Counted)] <- 0
colnames(zDlt_Counted)[1] <- "ID"
zDlt_Counted$Speaker_ID <- Unigram_data$Speaker_ID[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
zDlt_Counted$Sex <- Unigram_data$Sex[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
zDlt_Counted$Age <- Unigram_data$Age[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
zDlt_Counted$Place <- Unigram_data$Place[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
zDlt_Counted$Text <- Unigram_data$Text[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
zDlt_Counted$Sentence <- Unigram_data$Sentence[match(Raw_Ug_Counted$ID, Unigram_data$ID)]
Figure 1. Pitch sub-ranges in a recording sample
Unigram_data$Time_s <- as.numeric(Unigram_data$Time_s)
Unigram_data$Abs_value <- as.numeric(Unigram_data$Abs_value)
Unigram_data$ID <- as.character(Unigram_data$ID)
subset(Unigram_data, Unigram_data$ID %in% c("AnnaSh 2")) %>%
ggplot(aes(Time_s, Abs_value))+
xlab("Time")+
ylab("Pitch Value")+
geom_hline(aes(yintercept=85.56109814999999), color = "red")+
geom_hline(aes(yintercept=220.06107974516), color = "blue")+
geom_hline(aes(yintercept=354.56106134032), color = "blue")+
geom_hline(aes(yintercept=489.06104293548003), color = "red")+
annotate(geom="text", x = 1,
y = 75.56109814999999, label = "Outliers", colour='red', size = 3) +
annotate(geom="text", x = 1,
y = 150, label = "-1", colour='blue', size = 4) +
annotate(geom="text", x = 1,
y = 300, label = "0", colour='blue', size = 4) +
annotate(geom="text", x = 1,
y = 450, label = "1", colour='blue', size = 4) +
annotate(geom="text", x = 1,
y = 498.06104293548003, label = "Outliers", colour='red', size = 3) +
#xlim(min(Unigram_data[Unigram_data$ID == "AnnaSh 10",]$Time_s), max(Unigram_data[Unigram_data$ID == "AnnaSh 10",]$Time_s))+
xlim(93.5,96.5)+
geom_line()+
#facet_wrap(. ~ ID)+
theme_bw() -> Fig1
Fig1
After the text was published, I noticed a small mistake in the group names. The age groups should be named “≤40” instead of “25-40” and “>40” instead of “45+”. The mistake follows from the data collection protocol that we developed at some point. Since the current case study is partly based on the data collected before the protocol was established, the speakers could not be divided into such groups and I used 40 years as the cut-off point, but the plot titles and the description in the text were not changed accordingly. The mistake does not affect the counts and modelling. This grouping is only used for convenience in barplots, where age cannot be treated as a numeric variable; the models treat in as numeric. In the two plots below, the mistake has been corrected.
Unigram_data <- mutate(Unigram_data, Age_Group = as.factor(ifelse(Unigram_data$Age <= 40,"25-40 (corrected: ≤ 40 )","45+ (corrected: >40)")))
Unigram_data$Place_1 <- factor(Unigram_data$Place, levels = c("Krasnoyarsk", "Nakhodka", "Novosibirsk", "Moscow" ))
subset(Unigram_data, !is.na(zUnigram)) %>%
ggplot(aes(zUnigram, color=Place, fill=Place))+
geom_bar(stat = "count", position="dodge")+
xlab("Distribution of unigrams by Place, Sex and Age")+
ylab("Number of unigrams of each type")+
facet_wrap(~Place_1:Sex:Age_Group) -> Fig2
Fig2
zhist <- subset(Unigram_data, !is.na(zDelta))
zhist <- zhist[zhist$zDelta != 0,]
subset(zhist, !is.na(zDelta)) %>%
ggplot(aes(zDelta, color=Place, fill=Place))+
geom_bar(stat = "count", position="dodge")+
xlab("Distribution of deltas by Place, Sex and Age")+
ylab("Number of deltas of each type")+
facet_wrap(~Place_1:Sex:Age_Group) -> Fig3
Fig3
data_ug <- zUg_Counted
row.names(data_ug) <- paste(data_ug$Speaker_ID,data_ug$Sentence)
data_ug$X1prop <- data_ug$"-1"/(data_ug$"0" + data_ug$"1" + data_ug$"-1" + 1)
data_ug <- mutate(data_ug, TextType = as.factor(ifelse(grepl("Experiment",Text),"dialogue","monologue")))
data_ug <- mutate(data_ug, Role = as.factor(ifelse(grepl("Follower",Text),"hearer","speaker")))
data_ug$Age_Group<-c("Low", "High")[
findInterval(data_ug$Age , c(-Inf, 40, Inf) ) ]
data_ug$Text <- as.factor(data_ug$Text)
data_ug$Sex <- as.factor(data_ug$Sex)
data_ug$Age_Group <- as.factor(data_ug$Age_Group)
data_dlt <- zDlt_Counted
row.names(data_dlt) <- paste(data_dlt$Speaker_ID,data_dlt$Sentence)
data_dlt$X0prop <- (data_dlt$"0")/(data_dlt$"1" + data_dlt$"-1" + data_dlt$"2" + data_dlt$"-2" + data_dlt$"0" + 1)
data_dlt <- mutate(data_dlt, TextType = as.factor(ifelse(grepl("Experiment",Text),"dialogue","monologue")))
data_dlt <- mutate(data_dlt, Role = as.factor(ifelse(grepl("Follower",Text),"hearer","speaker")))
data_dlt$Age_Group<-c("Low", "High")[
findInterval(data_dlt$Age , c(-Inf, 40, Inf) ) ]
data_dlt$Text <- as.factor(data_dlt$Text)
data_dlt$Sex <- as.factor(data_dlt$Sex)
data_dlt$Age_Group <- as.factor(data_dlt$Age_Group)
Data_counted <- data_ug
summary(model.0 <- lmer(X1prop ~ Sex*Age + Place + TextType + (1 | Speaker_ID), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X1prop ~ Sex * Age + Place + TextType + (1 | Speaker_ID)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -548.9
Scaled residuals:
Min 1Q Median 3Q Max
-4.9164 -0.2972 0.0790 0.5523 2.7411
Random effects:
Groups Name Variance Std.Dev.
Speaker_ID (Intercept) 0.01733 0.1317
Residual 0.03013 0.1736
Number of obs: 1007, groups: Speaker_ID, 26
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.601353 0.114265 19.260845 5.263 4.25e-05 ***
SexM 0.352244 0.158879 19.106768 2.217 0.0389 *
Age 0.003763 0.002866 19.163845 1.313 0.2048
PlaceMoscow -0.020298 0.120736 18.997443 -0.168 0.8683
PlaceNakhodka -0.077046 0.071215 18.961099 -1.082 0.2929
PlaceNovosibirsk -0.124351 0.067907 19.081400 -1.831 0.0827 .
TextTypemonologue 0.028922 0.011270 980.597942 2.566 0.0104 *
SexM:Age -0.003494 0.003908 19.082356 -0.894 0.3825
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM Age PlcMsc PlcNkh PlcNvs TxtTyp
SexM -0.684
Age -0.872 0.630
PlaceMoscow -0.581 0.385 0.410
PlaceNakhdk -0.117 0.049 -0.206 0.196
PlacNvsbrsk -0.228 0.018 -0.081 0.249 0.510
TextTypmnlg -0.057 -0.002 -0.004 -0.003 0.002 0.010
SexM:Age 0.657 -0.938 -0.693 -0.334 -0.043 -0.016 0.001
drop1(model.0, test = "Chisq") # The Sex:Age interaction can be dropped
Single term deletions using Satterthwaite's method:
Model:
X1prop ~ Sex * Age + Place + TextType + (1 | Speaker_ID)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Place 0.104865 0.034955 3 19.03 1.1603 0.35075
TextType 0.198389 0.198389 1 980.60 6.5855 0.01043 *
Sex:Age 0.024074 0.024074 1 19.08 0.7991 0.38249
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model.1 <- lmer(X1prop ~ Sex + Age + Place + TextType + (1 | Speaker_ID), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X1prop ~ Sex + Age + Place + TextType + (1 | Speaker_ID)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -557.4
Scaled residuals:
Min 1Q Median 3Q Max
-4.9164 -0.3001 0.0776 0.5515 2.7325
Random effects:
Groups Name Variance Std.Dev.
Speaker_ID (Intercept) 0.01714 0.1309
Residual 0.03013 0.1736
Number of obs: 1007, groups: Speaker_ID, 26
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.668450 0.085700 20.251928 7.800 1.58e-07 ***
SexM 0.219035 0.054830 19.993714 3.995 0.000712 ***
Age 0.001988 0.002056 20.062626 0.967 0.345174
PlaceMoscow -0.056337 0.113195 19.929449 -0.498 0.624142
PlaceNakhodka -0.079783 0.070766 19.929332 -1.127 0.272955
PlaceNovosibirsk -0.125299 0.067534 20.055830 -1.855 0.078307 .
TextTypemonologue 0.028935 0.011270 980.580564 2.567 0.010395 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM Age PlcMsc PlcNkh PlcNvs
SexM -0.258
Age -0.766 -0.078
PlaceMoscow -0.509 0.220 0.263
PlaceNakhdk -0.118 0.025 -0.327 0.193
PlacNvsbrsk -0.289 0.008 -0.127 0.259 0.510
TextTypmnlg -0.077 -0.004 -0.005 -0.002 0.002 0.010
drop1(model.1, test = "Chisq") # Place can be dropped
Single term deletions using Satterthwaite's method:
Model:
X1prop ~ Sex + Age + Place + TextType + (1 | Speaker_ID)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Sex 0.48077 0.48077 1 19.99 15.9587 0.0007124 ***
Age 0.02816 0.02816 1 20.06 0.9346 0.3451740
Place 0.10505 0.03502 3 19.99 1.1623 0.3487780
TextType 0.19857 0.19857 1 980.58 6.5913 0.0103952 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model.2 <- lmer(X1prop ~ Sex + Age + TextType + (1 | Speaker_ID), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X1prop ~ Sex + Age + TextType + (1 | Speaker_ID)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -563.8
Scaled residuals:
Min 1Q Median 3Q Max
-4.9182 -0.2901 0.0731 0.5555 2.7175
Random effects:
Groups Name Variance Std.Dev.
Speaker_ID (Intercept) 0.01750 0.1323
Residual 0.03013 0.1736
Number of obs: 1007, groups: Speaker_ID, 26
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 6.230e-01 7.294e-02 2.339e+01 8.541 1.2e-08 ***
SexM 2.202e-01 5.395e-02 2.295e+01 4.082 0.000461 ***
Age 1.372e-03 1.837e-03 2.299e+01 0.747 0.462869
TextTypemonologue 2.912e-02 1.127e-02 9.808e+02 2.584 0.009902 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM Age
SexM -0.188
Age -0.863 -0.166
TextTypmnlg -0.089 -0.003 -0.004
drop1(model.2, test = "Chisq") # Age can be dropped
Single term deletions using Satterthwaite's method:
Model:
X1prop ~ Sex + Age + TextType + (1 | Speaker_ID)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Sex 0.50198 0.50198 1 22.95 16.6625 0.0004606 ***
Age 0.01679 0.01679 1 22.99 0.5574 0.4628685
TextType 0.20120 0.20120 1 980.78 6.6784 0.0099025 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
anova(model.1, model.2) # No significant difference between the models
Data: Data_counted
Models:
model.2: X1prop ~ Sex + Age + TextType + (1 | Speaker_ID)
model.1: X1prop ~ Sex + Age + Place + TextType + (1 | Speaker_ID)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
model.2 6 -579.36 -549.87 295.68 -591.36
model.1 9 -577.54 -533.31 297.77 -595.54 4.1766 3 0.243
summary(model.3 <- lmer(X1prop ~ Sex + TextType + (1 | Speaker_ID), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X1prop ~ Sex + TextType + (1 | Speaker_ID)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -574
Scaled residuals:
Min 1Q Median 3Q Max
-4.9178 -0.2932 0.0758 0.5578 2.7112
Random effects:
Groups Name Variance Std.Dev.
Speaker_ID (Intercept) 0.01716 0.1310
Residual 0.03013 0.1736
Number of obs: 1007, groups: Speaker_ID, 26
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.67000 0.03645 25.71047 18.381 2.62e-16 ***
SexM 0.22691 0.05269 23.92080 4.306 0.000244 ***
TextTypemonologue 0.02916 0.01127 980.80429 2.588 0.009803 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM
SexM -0.667
TextTypmnlg -0.186 -0.004
drop1(model.3, test = "Chisq") # Nothing can be dropped, still test against a simpler model
Single term deletions using Satterthwaite's method:
Model:
X1prop ~ Sex + TextType + (1 | Speaker_ID)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Sex 0.55865 0.55865 1 23.92 18.5432 0.000244 ***
TextType 0.20174 0.20174 1 980.80 6.6964 0.009803 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model.4 <- lmer(X1prop ~ Sex + (1 | Speaker_ID), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X1prop ~ Sex + (1 | Speaker_ID)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -574.5
Scaled residuals:
Min 1Q Median 3Q Max
-5.0087 -0.2960 0.0857 0.5752 2.6748
Random effects:
Groups Name Variance Std.Dev.
Speaker_ID (Intercept) 0.01728 0.1314
Residual 0.03030 0.1741
Number of obs: 1007, groups: Speaker_ID, 26
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.68751 0.03594 23.96336 19.129 5.08e-16 ***
SexM 0.22740 0.05288 23.91599 4.301 0.000247 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr)
SexM -0.680
drop1(model.4, test = "Chisq")
Single term deletions using Satterthwaite's method:
Model:
X1prop ~ Sex + (1 | Speaker_ID)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Sex 0.56036 0.56036 1 23.916 18.496 0.0002475 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
anova(model.3, model.4) # Model 3 is significantly better, so use it as the final model
Data: Data_counted
Models:
model.4: X1prop ~ Sex + (1 | Speaker_ID)
model.3: X1prop ~ Sex + TextType + (1 | Speaker_ID)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
model.4 4 -576.05 -556.39 292.02 -584.05
model.3 5 -580.74 -556.17 295.37 -590.74 6.6946 1 0.00967 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
plot(allEffects(model.3))
Data_counted <- data_dlt
summary(model.0 <- lmer(X0prop ~ Sex*Age + Place + Role + (1 | Speaker_ID), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X0prop ~ Sex * Age + Place + Role + (1 | Speaker_ID)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -4613.3
Scaled residuals:
Min 1Q Median 3Q Max
-6.8432 -0.2343 0.1717 0.4730 2.5043
Random effects:
Groups Name Variance Std.Dev.
Speaker_ID (Intercept) 0.0001869 0.01367
Residual 0.0005205 0.02281
Number of obs: 1007, groups: Speaker_ID, 26
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 9.508e-01 1.238e-02 2.162e+01 76.796 <2e-16 ***
SexM 4.117e-02 1.673e-02 1.917e+01 2.461 0.0235 *
Age 5.586e-04 3.018e-04 1.925e+01 1.851 0.0796 .
PlaceMoscow -1.748e-02 1.270e-02 1.902e+01 -1.376 0.1848
PlaceNakhodka -1.020e-02 7.499e-03 1.905e+01 -1.360 0.1899
PlaceNovosibirsk -7.467e-03 7.146e-03 1.912e+01 -1.045 0.3091
Rolespeaker -3.137e-03 3.295e-03 9.984e+02 -0.952 0.3412
SexM:Age -6.060e-04 4.114e-04 1.913e+01 -1.473 0.1569
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM Age PlcMsc PlcNkh PlcNvs Rlspkr
SexM -0.670
Age -0.843 0.630
PlaceMoscow -0.573 0.386 0.410
PlaceNakhdk -0.127 0.050 -0.207 0.198
PlacNvsbrsk -0.216 0.017 -0.081 0.248 0.508
Rolespeaker -0.243 0.023 -0.016 0.033 0.057 -0.016
SexM:Age 0.644 -0.938 -0.693 -0.335 -0.044 -0.015 -0.023
drop1(model.0) # Place can be dropped
Single term deletions using Satterthwaite's method:
Model:
X0prop ~ Sex * Age + Place + Role + (1 | Speaker_ID)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Place 0.00165090 0.00055030 3 19.12 1.0573 0.3903
Role 0.00047193 0.00047193 1 998.40 0.9067 0.3412
Sex:Age 0.00112980 0.00112980 1 19.13 2.1706 0.1569
summary(model.1 <- lmer(X0prop ~ Sex*Age + Role + (1 | Speaker_ID), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X0prop ~ Sex * Age + Role + (1 | Speaker_ID)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -4633.4
Scaled residuals:
Min 1Q Median 3Q Max
-6.8598 -0.2370 0.1719 0.4749 2.4881
Random effects:
Groups Name Variance Std.Dev.
Speaker_ID (Intercept) 0.0001883 0.01372
Residual 0.0005205 0.02281
Number of obs: 1007, groups: Speaker_ID, 26
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 9.407e-01 1.014e-02 2.592e+01 92.813 < 2e-16 ***
SexM 4.927e-02 1.542e-02 2.206e+01 3.195 0.00417 **
Age 6.270e-04 2.612e-04 2.214e+01 2.401 0.02520 *
Rolespeaker -2.846e-03 3.284e-03 9.998e+02 -0.867 0.38638
SexM:Age -7.792e-04 3.879e-04 2.204e+01 -2.009 0.05695 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM Age Rlspkr
SexM -0.608
Age -0.878 0.581
Rolespeaker -0.280 0.008 -0.021
SexM:Age 0.598 -0.930 -0.673 -0.010
drop1(model.1) # Role can be dropped
Single term deletions using Satterthwaite's method:
Model:
X0prop ~ Sex * Age + Role + (1 | Speaker_ID)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Role 0.00039087 0.00039087 1 999.76 0.7509 0.38638
Sex:Age 0.00210061 0.00210061 1 22.04 4.0357 0.05695 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model.2 <- lmer(X0prop ~ Sex*Age + (1 | Speaker_ID), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X0prop ~ Sex * Age + (1 | Speaker_ID)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -4642.2
Scaled residuals:
Min 1Q Median 3Q Max
-6.9057 -0.2408 0.1747 0.4746 2.5669
Random effects:
Groups Name Variance Std.Dev.
Speaker_ID (Intercept) 0.0001848 0.01359
Residual 0.0005206 0.02282
Number of obs: 1007, groups: Speaker_ID, 26
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.9382139 0.0096444 22.2473439 97.281 < 2e-16 ***
SexM 0.0493740 0.0152850 22.1623549 3.230 0.00382 **
Age 0.0006223 0.0002588 22.2183259 2.405 0.02495 *
SexM:Age -0.0007826 0.0003844 22.1321434 -2.036 0.05393 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM Age
SexM -0.631
Age -0.921 0.581
SexM:Age 0.620 -0.930 -0.673
drop1(model.2) # The interaction is on the edge of significance, but still drop
Single term deletions using Satterthwaite's method:
Model:
X0prop ~ Sex * Age + (1 | Speaker_ID)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Sex:Age 0.0021573 0.0021573 1 22.132 4.144 0.05393 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
anova(model.1, model.2) # No significant difference between the models, select the simpler one
Data: Data_counted
Models:
model.2: X0prop ~ Sex * Age + (1 | Speaker_ID)
model.1: X0prop ~ Sex * Age + Role + (1 | Speaker_ID)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
model.2 6 -4678.3 -4648.8 2345.1 -4690.3
model.1 7 -4676.9 -4642.5 2345.5 -4690.9 0.6832 1 0.4085
summary(model.3 <- lmer(X0prop ~ Sex + Age + (1 | Speaker_ID), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X0prop ~ Sex + Age + (1 | Speaker_ID)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -4652.2
Scaled residuals:
Min 1Q Median 3Q Max
-6.8891 -0.2419 0.1796 0.4756 2.5587
Random effects:
Groups Name Variance Std.Dev.
Speaker_ID (Intercept) 0.0002117 0.01455
Residual 0.0005206 0.02282
Number of obs: 1007, groups: Speaker_ID, 26
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 9.504e-01 8.065e-03 2.312e+01 117.845 < 2e-16 ***
SexM 2.044e-02 5.987e-03 2.303e+01 3.415 0.00237 **
Age 2.677e-04 2.039e-04 2.309e+01 1.313 0.20207
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM
SexM -0.189
Age -0.867 -0.166
drop1(model.3) # Age can be dropped
Single term deletions using Satterthwaite's method:
Model:
X0prop ~ Sex + Age + (1 | Speaker_ID)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Sex 0.0060695 0.0060695 1 23.026 11.6588 0.002371 **
Age 0.0008975 0.0008975 1 23.091 1.7241 0.202072
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model.4 <- lmer(X0prop ~ Sex + (1 | Speaker_ID), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X0prop ~ Sex + (1 | Speaker_ID)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -4665.6
Scaled residuals:
Min 1Q Median 3Q Max
-6.8875 -0.2365 0.1742 0.4798 2.5493
Random effects:
Groups Name Variance Std.Dev.
Speaker_ID (Intercept) 0.0002183 0.01478
Residual 0.0005206 0.02282
Number of obs: 1007, groups: Speaker_ID, 26
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.959556 0.004073 24.059565 235.611 < 2e-16 ***
SexM 0.021753 0.005991 23.996874 3.631 0.00133 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr)
SexM -0.680
drop1(model.4) # Nothing can be dropped
Single term deletions using Satterthwaite's method:
Model:
X0prop ~ Sex + (1 | Speaker_ID)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Sex 0.006864 0.006864 1 23.997 13.185 0.001331 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
plot(allEffects(model.4))
Data_counted <- data_ug
summary(model.0 <- lmer(X1prop ~ Sex*Age + TextType + (1 | Place), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X1prop ~ Sex * Age + TextType + (1 | Place)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -261.3
Scaled residuals:
Min 1Q Median 3Q Max
-4.2585 -0.3580 0.2147 0.6129 1.6948
Random effects:
Groups Name Variance Std.Dev.
Place (Intercept) 0.002685 0.05182
Residual 0.042974 0.20730
Number of obs: 1007, groups: Place, 4
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 5.622e-01 3.692e-02 1.021e+01 15.225 2.37e-08 ***
SexM 3.225e-01 3.946e-02 8.264e+02 8.173 1.12e-15 ***
Age 3.311e-03 7.129e-04 6.160e+02 4.644 4.18e-06 ***
TextTypemonologue 3.096e-02 1.342e-02 9.994e+02 2.306 0.02131 *
SexM:Age -2.898e-03 9.696e-04 8.998e+02 -2.989 0.00288 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM Age TxtTyp
SexM -0.447
Age -0.629 0.630
TextTypmnlg -0.213 -0.005 -0.013
SexM:Age 0.440 -0.939 -0.696 0.000
drop1(model.0, test = "Chisq") # Can't drop anything, all effects significant
Single term deletions using Satterthwaite's method:
Model:
X1prop ~ Sex * Age + TextType + (1 | Place)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
TextType 0.22853 0.22853 1 999.43 5.3178 0.021313 *
Sex:Age 0.38382 0.38382 1 899.82 8.9316 0.002879 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
plot(allEffects(model.0))
Data_counted <- data_dlt
summary(model.0 <- lmer(X0prop ~ Sex*Age + TextType + (1 | Place), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X0prop ~ Sex * Age + TextType + (1 | Place)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -4447.5
Scaled residuals:
Min 1Q Median 3Q Max
-7.4678 -0.2906 0.2355 0.5497 2.0371
Random effects:
Groups Name Variance Std.Dev.
Place (Intercept) 4.544e-05 0.006741
Residual 6.587e-04 0.025665
Number of obs: 1007, groups: Place, 4
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 9.397e-01 4.690e-03 8.352e+00 200.364 < 2e-16 ***
SexM 4.084e-02 4.892e-03 8.316e+02 8.348 2.88e-16 ***
Age 5.547e-04 8.844e-05 6.237e+02 6.271 6.68e-10 ***
TextTypemonologue 2.499e-04 1.662e-03 9.990e+02 0.150 0.88
SexM:Age -6.058e-04 1.202e-04 9.036e+02 -5.042 5.56e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM Age TxtTyp
SexM -0.437
Age -0.614 0.631
TextTypmnlg -0.208 -0.005 -0.013
SexM:Age 0.430 -0.939 -0.697 0.000
drop1(model.0, test = "Chisq")
Single term deletions using Satterthwaite's method:
Model:
X0prop ~ Sex * Age + TextType + (1 | Place)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
TextType 0.0000149 0.0000149 1 999.05 0.0226 0.8805
Sex:Age 0.0167462 0.0167462 1 903.56 25.4241 5.56e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model.1 <- lmer(X0prop ~ Sex*Age + (1 | Place), data = Data_counted, control = lmerControl(optimizer = "bobyqa")))
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: X0prop ~ Sex * Age + (1 | Place)
Data: Data_counted
Control: lmerControl(optimizer = "bobyqa")
REML criterion at convergence: -4458.5
Scaled residuals:
Min 1Q Median 3Q Max
-7.4679 -0.2871 0.2369 0.5495 2.0321
Random effects:
Groups Name Variance Std.Dev.
Place (Intercept) 4.542e-05 0.006739
Residual 6.580e-04 0.025652
Number of obs: 1007, groups: Place, 4
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 9.398e-01 4.586e-03 7.641e+00 204.932 1.43e-15 ***
SexM 4.084e-02 4.889e-03 8.325e+02 8.353 2.77e-16 ***
Age 5.548e-04 8.840e-05 6.246e+02 6.277 6.46e-10 ***
SexM:Age -6.058e-04 1.201e-04 9.044e+02 -5.045 5.49e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) SexM Age
SexM -0.447
Age -0.630 0.631
SexM:Age 0.439 -0.939 -0.697
drop1(model.1, test = "Chisq")
Single term deletions using Satterthwaite's method:
Model:
X0prop ~ Sex * Age + (1 | Place)
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Sex:Age 0.016746 0.016746 1 904.44 25.448 5.491e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model.2 <- lm(X0prop ~ Sex*Age, data = Data_counted))
Call:
lm(formula = X0prop ~ Sex * Age, data = Data_counted)
Residuals:
Min 1Q Median 3Q Max
-0.195842 -0.008086 0.007133 0.014010 0.044465
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.388e-01 2.921e-03 321.425 < 2e-16 ***
SexM 4.837e-02 4.596e-03 10.523 < 2e-16 ***
Age 6.148e-04 7.823e-05 7.859 9.95e-15 ***
SexM:Age -7.670e-04 1.152e-04 -6.658 4.56e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.02609 on 1003 degrees of freedom
Multiple R-squared: 0.1868, Adjusted R-squared: 0.1843
F-statistic: 76.79 on 3 and 1003 DF, p-value: < 2.2e-16
AIC(model.1)
[1] -4446.479
AIC(model.2) #AIC of Model 2 is greater, select Model 1
[1] -4479.879
plot(allEffects(model.1))
In the next two moels we do not run model selection, because we’re interested in the top-level interaction. Instead, we simply explore the significance levels of the effects and their combinations.
Data_counted <- data_dlt
Data_counted <- Data_counted[Data_counted$Place %nin% c("Moscow"),]
Data_counted$Speaker_ID <- factor(Data_counted$Speaker_ID)
Data_counted$Place <- factor(Data_counted$Place)
summary(model.0 <- lm(X0prop ~ Place:Age:Sex, data = Data_counted))
Call:
lm(formula = X0prop ~ Place:Age:Sex, data = Data_counted)
Residuals:
Min 1Q Median 3Q Max
-0.183488 -0.007275 0.004156 0.016106 0.038143
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.625e-01 2.408e-03 399.721 < 2e-16 ***
PlaceKrasnoyarsk:Age:SexF 2.232e-04 8.444e-05 2.643 0.00835 **
PlaceNakhodka:Age:SexF 1.287e-04 7.128e-05 1.806 0.07131 .
PlaceNovosibirsk:Age:SexF -1.182e-04 7.098e-05 -1.665 0.09617 .
PlaceKrasnoyarsk:Age:SexM 6.664e-04 8.475e-05 7.863 1.04e-14 ***
PlaceNakhodka:Age:SexM 1.821e-04 6.450e-05 2.824 0.00485 **
PlaceNovosibirsk:Age:SexM 4.938e-04 6.743e-05 7.323 5.29e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.02382 on 920 degrees of freedom
Multiple R-squared: 0.151, Adjusted R-squared: 0.1454
F-statistic: 27.27 on 6 and 920 DF, p-value: < 2.2e-16
plot(allEffects(model.0))
Data_counted <- data_ug
Data_counted <- Data_counted[Data_counted$Place %nin% c("Moscow"),]
Data_counted$Speaker_ID <- factor(Data_counted$Speaker_ID)
Data_counted$Place <- factor(Data_counted$Place)
summary(model.0 <- lm(X1prop ~ Place:Sex:Age, data = Data_counted))
Call:
lm(formula = X1prop ~ Place:Sex:Age, data = Data_counted)
Residuals:
Min 1Q Median 3Q Max
-0.80547 -0.06638 0.01743 0.14479 0.48352
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.7049221 0.0200658 35.130 < 2e-16 ***
PlaceKrasnoyarsk:SexF:Age 0.0024164 0.0007036 3.434 0.000621 ***
PlaceNakhodka:SexF:Age 0.0016829 0.0005940 2.833 0.004712 **
PlaceNovosibirsk:SexF:Age -0.0034450 0.0005915 -5.824 7.92e-09 ***
PlaceKrasnoyarsk:SexM:Age 0.0070117 0.0007063 9.927 < 2e-16 ***
PlaceNakhodka:SexM:Age 0.0027175 0.0005375 5.056 5.17e-07 ***
PlaceNovosibirsk:SexM:Age 0.0054400 0.0005620 9.680 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1985 on 920 degrees of freedom
Multiple R-squared: 0.2973, Adjusted R-squared: 0.2927
F-statistic: 64.88 on 6 and 920 DF, p-value: < 2.2e-16
plot(allEffects(model.0))